1.Intro:

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. source: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

2. Data Importing & Cleaning & Inspecting

2-1) Import dataset

wbcd means ‘wisconsin breast cancer data’

# Load the raw Kaggle CSV. Spell out TRUE/FALSE rather than the
# reassignable shorthands T/F; keep strings as character so `diagnosis`
# can be recoded into a factor explicitly later.
wbcd <- read.csv("breast_cancer.csv", header=TRUE, stringsAsFactors=FALSE)

2-2) Remove NULL Data

# Drop column X -- presumably an empty trailing column created by
# read.csv from trailing commas in the Kaggle file; TODO confirm.
wbcd$X <- NULL

2-3) Reshape the datasets

# Drop the first column (the id column, which carries no predictive
# information), then turn the "B"/"M" diagnosis codes into a labelled
# factor. As in the original, any non-"B" value is labelled "Malignant".
wbcd <- wbcd[, -1]
diag_labels <- ifelse(wbcd$diagnosis == "B", "Benign", "Malignant")
wbcd$diagnosis <- factor(diag_labels)

2-4) Inspect the datasets

structure

# Structure check: 569 observations, diagnosis factor + 30 numeric features.
str(wbcd)
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

summary

# Per-feature five-number summaries plus class counts for diagnosis.
summary(wbcd)
##      diagnosis    radius_mean      texture_mean   perimeter_mean  
##  Benign   :357   Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  Malignant:212   1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##                  Median :13.370   Median :18.84   Median : 86.24  
##                  Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##                  3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##                  Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se    
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509  
##  Median :0.006380   Median :0.020450   Median :0.02589  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600  
##  concave.points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.007638   1st Qu.:0.015160   1st Qu.:0.0022480   
##  Median :0.010930   Median :0.018730   Median :0.0031870   
##  Mean   :0.011796   Mean   :0.020542   Mean   :0.0037949   
##  3rd Qu.:0.014710   3rd Qu.:0.023480   3rd Qu.:0.0045580   
##  Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11   1st Qu.: 515.3  
##  Median :14.97   Median :25.41   Median : 97.66   Median : 686.5  
##  Mean   :16.27   Mean   :25.68   Mean   :107.26   Mean   : 880.6  
##  3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40   3rd Qu.:1084.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave.points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145   1st Qu.:0.06493     
##  Median :0.13130   Median :0.21190   Median :0.2267   Median :0.09993     
##  Mean   :0.13237   Mean   :0.25427   Mean   :0.2722   Mean   :0.11461     
##  3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829   3rd Qu.:0.16140     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst
##  Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2822   Median :0.08004        
##  Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.6638   Max.   :0.20750

3. Analyze the Correlation between variables

3-1) Correlation between each variables

There are many ways to draw a correlation plot!

For practice, I applied different function to each data (mean, se, worst)

Mean

library(PerformanceAnalytics)
# Pairwise correlation panel of the "_mean" features (columns 2-11).
chart.Correlation(wbcd[,c(2:11)],histogram=TRUE, col="grey10", pch=1, main="Cancer Mean")

SE

library(psych)
# Scatterplot matrix of the "_se" features (columns 12-21). TRUE/FALSE
# spelled out instead of the reassignable shorthands T/F.
pairs.panels(wbcd[,c(12:21)], ellipses=TRUE, pch=1, lm=TRUE, cex.cor=1, smoother=FALSE, stars = TRUE, main="Cancer SE")

Worst

library(ggplot2)
library(GGally)
# ggpairs matrix of the "_worst" features (columns 22-31).
ggpairs(wbcd[,c(22:31)])+ theme_bw()+
labs(title="Cancer Worst")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=13))

3-2) See the relation between each variables (diagnosis included)

I think viewing the plots with diagnosis included is much more informative than the combined plots in [3-1].

# ggplot2/GGally are used for the diagnosis-colored pair plots below.
library(ggplot2)
library(GGally)

Mean

# Pairwise plots of the "_mean" features; diagnosis (column 1) is
# appended last so it can drive the color aesthetic.
ggpairs(wbcd[,c(2:11,1)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer Mean")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

SE

# Same as above for the "_se" features, colored by diagnosis.
ggpairs(wbcd[,c(12:21,1)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer SE")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

Worst

# Same as above for the "_worst" features, colored by diagnosis.
ggpairs(wbcd[,c(22:31,1)], aes(color=diagnosis, alpha=0.75), lower=list(continuous="smooth"))+ theme_bw()+
labs(title="Cancer Worst")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

3-3) See the ggcorr plot

By ggcorr, we can see the correlation value more directly than above graph.

Mean

# Correlation heatmap of the "_mean" features with coefficients printed.
ggcorr(wbcd[,c(2:11)], name = "corr", label = TRUE)+
  theme(legend.position="none")+
labs(title="Cancer Mean")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

SE

# Correlation heatmap of the "_se" features with coefficients printed.
ggcorr(wbcd[,c(12:21)], name = "corr", label = TRUE)+
  theme(legend.position="none")+
labs(title="Cancer SE")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

Worst

# Correlation heatmap of the "_worst" features with coefficients printed.
ggcorr(wbcd[,c(22:31)], name = "corr", label = TRUE)+
  theme(legend.position="none")+
labs(title="Cancer Worst")+
theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))

3-4) Principal Component Analysis (PCA)

Too many variables can cause such problems below

  • Increased computational cost

  • Too complex visualization problems

  • Decrease efficiency by including variables that have no effect on the analysis

  • Make data interpretation difficult

As the ggcorr plots above [3-3] show, high correlation values indicate “multicollinearity” between variables.

-> Use a few principal components for model development by reducing the highly correlated variables.

PCA uses standardized data so that it can avoid data distortion caused by scale difference

# transform(wbcd) with no transformation arguments is a no-op that just
# returns the data frame, so take a plain copy instead. Standardization
# itself is done by prcomp(..., scale = TRUE) below, which avoids
# distortion from the features' very different scales.
wbcd_pca <- wbcd

All

# PCA on all 30 features; scale = TRUE standardizes each variable first.
all_prcomp <- prcomp(wbcd_pca[,-1], scale = TRUE)
summary(all_prcomp)
## Importance of components%s:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
##                            PC7     PC8    PC9    PC10   PC11    PC12
## Standard deviation     0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion  0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion  0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
##                           PC19    PC20   PC21    PC22    PC23   PC24
## Standard deviation     0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion  0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
##                           PC25    PC26    PC27    PC28    PC29    PC30
## Standard deviation     0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion  0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Scree plot of the first 10 PCs to judge how many components to keep.
screeplot(all_prcomp, npcs=10, type="lines")

Mean

# PCA on the "_mean" features only (columns 2-11), standardized.
mean_prcomp <- prcomp(wbcd_pca[,c(2:11)], scale = TRUE)
summary(mean_prcomp)
## Importance of components%s:
##                           PC1    PC2     PC3    PC4     PC5     PC6
## Standard deviation     2.3406 1.5870 0.93841 0.7064 0.61036 0.35234
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241
## Cumulative Proportion  0.5479 0.7997 0.88779 0.9377 0.97495 0.98736
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.28299 0.18679 0.10552 0.01680
## Proportion of Variance 0.00801 0.00349 0.00111 0.00003
## Cumulative Proportion  0.99537 0.99886 0.99997 1.00000
# Scree plot of the first 4 PCs of the "_mean" PCA.
screeplot(mean_prcomp, npcs=4, type="lines")

# Print standard deviations and the loadings (rotation matrix).
print(mean_prcomp)
## Standard deviations (1, .., p=10):
##  [1] 2.34063837 1.58704555 0.93841099 0.70640600 0.61035989 0.35233755
##  [7] 0.28299348 0.18678810 0.10552469 0.01680196
## 
## Rotation (n x k) = (10 x 10):
##                                PC1          PC2         PC3          PC4
## radius_mean            -0.36393793  0.313929073 -0.12442759  0.029558858
## texture_mean           -0.15445113  0.147180909  0.95105659  0.008916084
## perimeter_mean         -0.37604434  0.284657885 -0.11408360  0.013458069
## area_mean              -0.36408585  0.304841714 -0.12337786  0.013442682
## smoothness_mean        -0.23248053 -0.401962324 -0.16653247 -0.107802033
## compactness_mean       -0.36444206 -0.266013147  0.05827786 -0.185700413
## concavity_mean         -0.39574849 -0.104285968  0.04114649 -0.166653523
## concave.points_mean    -0.41803840 -0.007183605 -0.06855383 -0.072983951
## symmetry_mean          -0.21523797 -0.368300910  0.03672364  0.892998475
## fractal_dimension_mean -0.07183744 -0.571767700  0.11358395 -0.349331790
##                                 PC5          PC6         PC7          PC8
## radius_mean            -0.031067022  0.264180150 -0.04418839  0.084834062
## texture_mean           -0.219922761  0.032206572  0.02055748 -0.007126797
## perimeter_mean         -0.005945081  0.237819464 -0.08336923  0.089258879
## area_mean              -0.019341222  0.331707454  0.26118796  0.144609749
## smoothness_mean        -0.843745292 -0.062225368  0.01129197  0.170503128
## compactness_mean        0.240182967 -0.005271104 -0.80380484  0.063980134
## concavity_mean          0.312533244 -0.601467155  0.36713629  0.449573315
## concave.points_mean    -0.009180198 -0.265613395  0.14131308 -0.850918762
## symmetry_mean           0.112888068  0.061957003  0.04790201  0.016455606
## fractal_dimension_mean  0.264878077  0.567918997  0.34521359 -0.065259461
##                                 PC9          PC10
## radius_mean             0.474425305 -0.6690714888
## texture_mean            0.004212629  0.0002497826
## perimeter_mean          0.380167210  0.7404905337
## area_mean              -0.747347357 -0.0323589585
## smoothness_mean         0.005847386  0.0036904058
## compactness_mean       -0.218732407 -0.0527527802
## concavity_mean          0.081170670 -0.0103668020
## concave.points_mean    -0.022024652 -0.0037475480
## symmetry_mean           0.009067850  0.0014669472
## fractal_dimension_mean  0.129667491  0.0070573477

SE

# PCA on the "_se" features only (columns 12-21), standardized.
se_prcomp <- prcomp(wbcd_pca[,c(12:21)], scale = TRUE)
summary(se_prcomp)
## Importance of components%s:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     2.1779 1.4406 1.1245 0.77095 0.75991 0.57939
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357
## Cumulative Proportion  0.4743 0.6819 0.8083 0.86774 0.92548 0.95905
##                            PC7    PC8     PC9    PC10
## Standard deviation     0.43512 0.3962 0.20436 0.14635
## Proportion of Variance 0.01893 0.0157 0.00418 0.00214
## Cumulative Proportion  0.97798 0.9937 0.99786 1.00000
# Scree plot of the first 4 PCs of the "_se" PCA.
screeplot(se_prcomp, npcs=4, type="lines")

# Print standard deviations and the loadings (rotation matrix).
print(se_prcomp)
## Standard deviations (1, .., p=10):
##  [1] 2.1779279 1.4405579 1.1244649 0.7709473 0.7599129 0.5793947 0.4351151
##  [8] 0.3961933 0.2043629 0.1463479
## 
## Rotation (n x k) = (10 x 10):
##                             PC1         PC2         PC3         PC4
## radius_se            -0.3455917  0.44035402  0.08078489  0.04864424
## texture_se           -0.1886093 -0.15339415  0.59152980 -0.26297794
## perimeter_se         -0.3574809  0.42030257  0.05877767 -0.01002982
## area_se              -0.3040197  0.50021113  0.02483694  0.07280027
## smoothness_se        -0.2124504 -0.27095295  0.42747680  0.79615347
## compactness_se       -0.3747987 -0.24262835 -0.25680860 -0.08700675
## concavity_se         -0.3555528 -0.22912114 -0.33819846 -0.10133141
## concave.points_se    -0.3857430 -0.08499145 -0.22956424  0.04019019
## symmetry_se          -0.2363156 -0.19857350  0.43932087 -0.51576918
## fractal_dimension_se -0.3287895 -0.35250198 -0.17529054  0.06118979
##                              PC5          PC6          PC7         PC8
## radius_se             0.01622501 -0.088641991  0.021382456 -0.12552302
## texture_se           -0.71881713  0.009450168  0.007842011  0.04858855
## perimeter_se          0.01739386 -0.039589383 -0.100936382  0.03364523
## area_se               0.02485467 -0.143033086  0.178629163  0.06572712
## smoothness_se         0.18211162  0.089995405  0.100523214  0.11114632
## compactness_se        0.01693787 -0.214913715 -0.307501612  0.75611931
## concavity_se         -0.09471984  0.226154507  0.788497259  0.01702619
## concave.points_se    -0.08799050  0.672610241 -0.463368226 -0.29208975
## symmetry_se           0.65540293  0.074907975  0.026405642 -0.07517928
## fractal_dimension_se -0.04898647 -0.637635341 -0.124849130 -0.54872845
##                              PC9         PC10
## radius_se             0.31915726 -0.742675565
## texture_se           -0.05112734 -0.002855984
## perimeter_se          0.51822457  0.640508399
## area_se              -0.75963699  0.130732798
## smoothness_se         0.02331983  0.024217750
## compactness_se       -0.01289135 -0.119255784
## concavity_se          0.11651039  0.027004427
## concave.points_se    -0.17123533 -0.012995134
## symmetry_se          -0.06466883 -0.002715704
## fractal_dimension_se -0.04616613  0.073272411

Worst

# PCA on the "_worst" features only (columns 22-31), standardized.
worst_prcomp <- prcomp(wbcd_pca[,c(22:31)], scale = TRUE)
summary(worst_prcomp)
## Importance of components%s:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.3869 1.4443 0.89597 0.73531 0.71741 0.42862
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837
## Cumulative Proportion  0.5697 0.7783 0.85860 0.91267 0.96413 0.98251
##                            PC7     PC8     PC9    PC10
## Standard deviation     0.28959 0.26802 0.12343 0.06326
## Proportion of Variance 0.00839 0.00718 0.00152 0.00040
## Cumulative Proportion  0.99089 0.99808 0.99960 1.00000
# Scree plot of the first 4 PCs of the "_worst" PCA.
screeplot(worst_prcomp, npcs=4, type="lines")

# Print standard deviations and the loadings (rotation matrix).
print(worst_prcomp)
## Standard deviations (1, .., p=10):
##  [1] 2.38688848 1.44429302 0.89597293 0.73531379 0.71740732 0.42862478
##  [7] 0.28959132 0.26801978 0.12342831 0.06326496
## 
## Rotation (n x k) = (10 x 10):
##                                PC1         PC2         PC3         PC4
## radius_worst            -0.3359101  0.40313668 -0.07613333  0.07095866
## texture_worst           -0.2007314  0.04257198  0.97682386 -0.00233435
## perimeter_worst         -0.3481510  0.37551796 -0.08382037  0.03361042
## area_worst              -0.3247392  0.41525563 -0.07902211  0.06609632
## smoothness_worst        -0.2486258 -0.33786981 -0.05144303  0.31183688
## compactness_worst       -0.3645682 -0.25056608 -0.03801446 -0.26982605
## concavity_worst         -0.3747424 -0.13908482 -0.05855486 -0.32050054
## concave.points_worst    -0.3976373  0.04168507 -0.13217642 -0.05213711
## symmetry_worst          -0.2497528 -0.30860719 -0.02146397  0.77152522
## fractal_dimension_worst -0.2540829 -0.47849501 -0.03601462 -0.34456154
##                                   PC5         PC6         PC7         PC8
## radius_worst            -0.0269138039 -0.17376560  0.02580208  0.01497099
## texture_worst           -0.0290270596  0.01509874 -0.02645941 -0.04311839
## perimeter_worst          0.0006772917 -0.13172429 -0.02654558  0.09221707
## area_worst              -0.0692448712 -0.29437547  0.24876937  0.03172400
## smoothness_worst        -0.8263639733  0.07114762  0.09077120  0.16235311
## compactness_worst        0.2021719220 -0.01079188 -0.39766075  0.71502532
## concavity_worst          0.1650942746  0.53132580  0.64845080 -0.03381691
## concave.points_worst    -0.0538628284  0.39305063 -0.58217320 -0.54530352
## symmetry_worst           0.4889956064 -0.02864905  0.06597451 -0.04766924
## fractal_dimension_worst  0.0247555394 -0.65021091  0.07683158 -0.38731726
##                                  PC9         PC10
## radius_worst            -0.426117589  0.707409982
## texture_worst            0.006193392 -0.006001877
## perimeter_worst         -0.459151548 -0.701598949
## area_worst               0.745255816 -0.041754195
## smoothness_worst        -0.039457323 -0.006807917
## compactness_worst        0.121416060  0.070202394
## concavity_worst         -0.052865741  0.009177221
## concave.points_worst     0.162096574  0.003346891
## symmetry_worst           0.006068817 -0.008600691
## fractal_dimension_worst -0.082179137 -0.020161298

3-5) See the Biplot

# factoextra provides fviz_pca_biplot() used for the biplots below.
library("factoextra")

All

# PCA biplot, all 30 features: points = patients colored by diagnosis,
# arrows = variable loadings, with concentration ellipses per group.
fviz_pca_biplot(all_prcomp, col.ind = wbcd$diagnosis, col="black",
                palette = "jco", geom = "point", repel=TRUE,
                legend.title="Diagnosis", addEllipses = TRUE)

Mean

# Biplot of the "_mean" PCA, colored by diagnosis.
fviz_pca_biplot(mean_prcomp, col.ind = wbcd$diagnosis, col="black",
                palette = "jco", geom = "point", repel=TRUE,
                legend.title="Diagnosis", addEllipses = TRUE)

SE

# Biplot of the "_se" PCA, colored by diagnosis.
fviz_pca_biplot(se_prcomp, col.ind = wbcd$diagnosis, col="black",
                palette = "jco", geom = "point", repel=TRUE,
                legend.title="Diagnosis", addEllipses = TRUE)

Worst

# Biplot of the "_worst" PCA, colored by diagnosis.
fviz_pca_biplot(worst_prcomp, col.ind = wbcd$diagnosis, col="black",
                palette = "jco", geom = "point", repel=TRUE,
                legend.title="Diagnosis", addEllipses = TRUE)

4. Apply several ML methods, compare them, and choose the best fit

4-1) Make test & train dataset for testing classification ML methods

Shuffle the wbcd data(100%) & Make train dataset(70%), test dataset(30%)

# Keep diagnosis (column 1) plus 13 tree-selected feature columns.
# NOTE(review): the prose below lists 12 feature names but 13 feature
# columns are selected here -- verify the intended feature set.
wbcd1 <- wbcd[, c(1,8,28,15,5,3,18,11,20,6,13,21,10,26)]
nrows <- NROW(wbcd1)
# set.seed(1)               ## fix random value (left disabled by the author)
index <- sample(seq_len(nrows), 0.7 * nrows)   ## shuffle and divide

#train <- wbcd1                 ## use all 569 rows instead (disabled)
train <- wbcd1[index,]          ## 398 train rows (70%); comment previously mislabelled this "test"
test <- wbcd1[-index,]          ## 171 test rows (30%)

4-2) Check the proportion of diagnosis (Benign / Malignant)

train

# Class balance of the training split.
prop.table(table(train$diagnosis))
## 
##    Benign Malignant 
## 0.6306533 0.3693467

test

# Class balance of the test split (should roughly match the train split).
prop.table(table(test$diagnosis))
## 
##    Benign Malignant 
##  0.619883  0.380117

4-3) Apply every ML method (that I know) to the data with tree-based feature selection:

our features are: ‘texture_mean’, ‘area_mean’, ‘smoothness_mean’, ‘concavity_mean’, ‘symmetry_mean’, ‘fractal_dimension_mean’, ‘area_se’, ‘smoothness_se’, ‘fractal_dimension_se’, ‘smoothness_worst’, ‘concavity_worst’, ‘symmetry_worst’

C5.0

# C5.0 decision tree: fit on the training predictors (column 1 holds the
# diagnosis label), predict the held-out test set, and summarise with
# caret's confusionMatrix ("Benign" is the positive class in the output).
library(C50)
library(caret)
learn_c50 <- C5.0(train[,-1],train$diagnosis)
pre_c50 <- predict(learn_c50, test[,-1])
cm_c50 <- confusionMatrix(pre_c50, test$diagnosis)
cm_c50
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102        11
##   Malignant      4        54
##                                           
##                Accuracy : 0.9123          
##                  95% CI : (0.8594, 0.9501)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8099          
##  Mcnemar's Test P-Value : 0.1213          
##                                           
##             Sensitivity : 0.9623          
##             Specificity : 0.8308          
##          Pos Pred Value : 0.9027          
##          Neg Pred Value : 0.9310          
##              Prevalence : 0.6199          
##          Detection Rate : 0.5965          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.8965          
##                                           
##        'Positive' Class : Benign          
## 

C5.0 - Tune

# Grid-search the C5.0 boosting parameter `trials`.
#
# Fits one C5.0 model per trials value on `train` (first column must be
# the factor `diagnosis`) and scores each model on `test`.
#
# Args:
#   train, test: data.frames with `diagnosis` in column 1 and identical
#                predictor columns.
#   max_trials:  largest boosting-iteration count to try (default 100,
#                matching the original behavior).
# Returns:
#   Numeric vector; element i is the test-set accuracy for trials = i.
total_accuracy_c50 <- function(train, test, max_trials = 100){
    accuracy <- numeric(max_trials)   # preallocate instead of growing in the loop
    for(i in seq_len(max_trials)){
        fit <- C5.0(train[,-1], train$diagnosis, trials = i)
        pred <- predict(fit, test[,-1])
        cm <- confusionMatrix(pred, test$diagnosis)
        accuracy[i] <- cm$overall[1]  # "Accuracy" entry
    }
    accuracy
}

# Pick the trials value with the highest test accuracy (first maximum on
# ties) and refit the boosted tree with it.
# NOTE(review): tuning directly against the test set leaks information
# into the model choice; a separate validation split would be cleaner.
a <- total_accuracy_c50(train,test)
opt_trials <- which(a==max(a))[1]

learn_imp_c50 <- C5.0(train[,-1],train$diagnosis,trials=opt_trials)
pre_imp_c50 <- predict(learn_imp_c50, test[,-1])
cm_imp_c50 <- confusionMatrix(pre_imp_c50, test$diagnosis)
cm_imp_c50
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         7
##   Malignant      2        58
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8866          
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.9811          
##             Specificity : 0.8923          
##          Pos Pred Value : 0.9369          
##          Neg Pred Value : 0.9667          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6491          
##       Balanced Accuracy : 0.9367          
##                                           
##        'Positive' Class : Benign          
## 

rpart

library(rpart)
# CART tree; minsplit=2 allows splits down to two cases, giving a very
# deep tree (pruned in the next step).
learn_rp <- rpart(diagnosis~.,data=train,control=rpart.control(minsplit=2))
pre_rp <- predict(learn_rp, test[,-1], type="class")
cm_rp  <- confusionMatrix(pre_rp, test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         8
##   Malignant      2        57
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8737          
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.9811          
##             Specificity : 0.8769          
##          Pos Pred Value : 0.9286          
##          Neg Pred Value : 0.9661          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6550          
##       Balanced Accuracy : 0.9290          
##                                           
##        'Positive' Class : Benign          
## 

Prune

# Prune back to the subtree whose complexity parameter minimises the
# cross-validated error ("xerror") in the fitted tree's CP table.
learn_pru <- prune(learn_rp, cp=learn_rp$cptable[which.min(learn_rp$cptable[,"xerror"]),"CP"])
pre_pru <- predict(learn_pru, test[,-1], type="class")
cm_pru <-confusionMatrix(pre_pru, test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         8
##   Malignant      2        57
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8737          
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.9811          
##             Specificity : 0.8769          
##          Pos Pred Value : 0.9286          
##          Neg Pred Value : 0.9661          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6550          
##       Balanced Accuracy : 0.9290          
##                                           
##        'Positive' Class : Benign          
## 

OneR

library("RWeka")
# OneR: single-rule baseline classifier (Weka, via RWeka).
learn_1r <- OneR(diagnosis~., data=train)
pre_1r <- predict(learn_1r, test[,-1])
cm_1r   <- confusionMatrix(pre_1r, test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        97        17
##   Malignant      9        48
##                                           
##                Accuracy : 0.848           
##                  95% CI : (0.7852, 0.8982)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : 4.911e-11       
##                                           
##                   Kappa : 0.6695          
##  Mcnemar's Test P-Value : 0.1698          
##                                           
##             Sensitivity : 0.9151          
##             Specificity : 0.7385          
##          Pos Pred Value : 0.8509          
##          Neg Pred Value : 0.8421          
##              Prevalence : 0.6199          
##          Detection Rate : 0.5673          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.8268          
##                                           
##        'Positive' Class : Benign          
## 

JRip

# JRip: Weka's RIPPER rule learner (RWeka was loaded for OneR above).
learn_jrip <- JRip(diagnosis ~ ., data=train)
pre_jrip <- predict(learn_jrip, test[,-1])
cm_jrip <- confusionMatrix(pre_jrip, test$diagnosis)
cm_jrip
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102         7
##   Malignant      4        58
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8623          
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9623          
##             Specificity : 0.8923          
##          Pos Pred Value : 0.9358          
##          Neg Pred Value : 0.9355          
##              Prevalence : 0.6199          
##          Detection Rate : 0.5965          
##    Detection Prevalence : 0.6374          
##       Balanced Accuracy : 0.9273          
##                                           
##        'Positive' Class : Benign          
## 

naiveBayes

library(e1071)
# Naive Bayes with default settings, evaluated on the test split.
learn_nb <- naiveBayes(train[,-1], train$diagnosis)
pre_nb <- predict(learn_nb, test[,-1])
cm_nb     <- confusionMatrix(pre_nb, test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105        10
##   Malignant      1        55
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8597          
##  Mcnemar's Test P-Value : 0.01586         
##                                           
##             Sensitivity : 0.9906          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.9130          
##          Neg Pred Value : 0.9821          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.6725          
##       Balanced Accuracy : 0.9184          
##                                           
##        'Positive' Class : Benign          
## 

naiveBayes - Tune

# Grid-search the naive Bayes Laplace-smoothing parameter.
#
# Args:
#   train, test: data.frames with the factor `diagnosis` in column 1 and
#                identical predictor columns.
#   max_laplace: largest smoothing value to try (default 100, matching
#                the original behavior).
# Returns:
#   Numeric vector; element i is the test-set accuracy for laplace = i.
# NOTE(review): `laplace` only affects categorical predictors; with
# all-numeric features every i likely yields the same accuracy -- confirm.
total_accuracy_nb <- function(train, test, max_laplace = 100){
    library(e1071)
    library(caret)
    accuracy <- numeric(max_laplace)   # preallocate instead of growing in the loop
    for(i in seq_len(max_laplace)){
        fit <- naiveBayes(train[,-1], train$diagnosis, laplace=i)
        pred <- predict(fit, test[,-1])
        cm <- confusionMatrix(pred, test$diagnosis)
        accuracy[i] <- cm$overall[1]   # "Accuracy" entry
    }
    accuracy
}

# Accuracy for each Laplace value 1..100; keep the first value attaining the max.
b <- total_accuracy_nb(train,test)
opt_laplace <- which(b==max(b))[1]

# Refit with the selected smoothing value and evaluate on the test set.
learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace=opt_laplace)
pre_imp_nb <- predict(learn_imp_nb, test[,-1])
cm_imp_nb <- confusionMatrix(pre_imp_nb, test$diagnosis)
cm_imp_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105        10
##   Malignant      1        55
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8597          
##  Mcnemar's Test P-Value : 0.01586         
##                                           
##             Sensitivity : 0.9906          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.9130          
##          Neg Pred Value : 0.9821          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.6725          
##       Balanced Accuracy : 0.9184          
##                                           
##        'Positive' Class : Benign          
## 

randomForest

library(randomForest)
# 100-tree random forest; proximity=T additionally computes the case proximity matrix.
learn_rf <- randomForest(diagnosis~., data=train, ntree=100, proximity=T)
pre_rf   <- predict(learn_rf, test[,-1])
cm_rf    <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         9
##   Malignant      2        56
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8606          
##  Mcnemar's Test P-Value : 0.07044         
##                                           
##             Sensitivity : 0.9811          
##             Specificity : 0.8615          
##          Pos Pred Value : 0.9204          
##          Neg Pred Value : 0.9655          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.9213          
##                                           
##        'Positive' Class : Benign          
## 

ctree

library(party)
# Conditional inference tree, depth capped at 2 for an interpretable model.
learn_ct <- ctree(diagnosis~., data=train, controls=ctree_control(maxdepth=2))
pre_ct   <- predict(learn_ct, test[,-1])
cm_ct    <- confusionMatrix(pre_ct, test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104        21
##   Malignant      2        44
##                                          
##                Accuracy : 0.8655         
##                  95% CI : (0.805, 0.9128)
##     No Information Rate : 0.6199         
##     P-Value [Acc > NIR] : 9.936e-13      
##                                          
##                   Kappa : 0.6975         
##  Mcnemar's Test P-Value : 0.0001746      
##                                          
##             Sensitivity : 0.9811         
##             Specificity : 0.6769         
##          Pos Pred Value : 0.8320         
##          Neg Pred Value : 0.9565         
##              Prevalence : 0.6199         
##          Detection Rate : 0.6082         
##    Detection Prevalence : 0.7310         
##       Balanced Accuracy : 0.8290         
##                                          
##        'Positive' Class : Benign         
## 

KNN

library(class)
# k-nearest neighbours (k=25); prob=T attaches the winning-vote proportion.
# NOTE(review): the features are not scaled here, so wide-range columns (e.g.
# area_mean) dominate the Euclidean distance — confirm this is intentional.
pre_knn <- knn(train = train[,-1], test = test[,-1], cl = train[,1], k=25, prob=T)
cm_knn  <- confusionMatrix(pre_knn, test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105        21
##   Malignant      1        44
##                                           
##                Accuracy : 0.8713          
##                  95% CI : (0.8117, 0.9176)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : 2.461e-13       
##                                           
##                   Kappa : 0.7097          
##  Mcnemar's Test P-Value : 5.104e-05       
##                                           
##             Sensitivity : 0.9906          
##             Specificity : 0.6769          
##          Pos Pred Value : 0.8333          
##          Neg Pred Value : 0.9778          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.7368          
##       Balanced Accuracy : 0.8337          
##                                           
##        'Positive' Class : Benign          
## 

GBM

library(gbm)
# First gbm fit is used only to choose the number of boosting iterations via
# 3-fold CV (gbm.perf below).
# NOTE(review): distribution="gaussian" with a factor response is unusual —
# presumably only best.iter is consumed from this fit; verify.
test_gbm <- gbm(diagnosis~., data=train, distribution="gaussian",n.trees = 10000,
                shrinkage = 0.01, interaction.depth = 4, bag.fraction=0.5, train.fraction=0.5,n.minobsinnode=10,cv.folds=3,keep.data=TRUE,verbose=FALSE,n.cores=1)
best.iter <- gbm.perf(test_gbm, method="cv",plot.it=FALSE)
# Refit through caret with the CV-chosen iteration count fixed in the tuning grid.
fitControl = trainControl(method="cv", number=5, returnResamp="all")
learn_gbm = train(diagnosis~., data=train, method="gbm", distribution="bernoulli", trControl=fitControl, verbose=F, tuneGrid=data.frame(.n.trees=best.iter, .shrinkage=0.01, .interaction.depth=1, .n.minobsinnode=1))
pre_gbm <- predict(learn_gbm, test[,-1])
cm_gbm <- confusionMatrix(pre_gbm, test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105         8
##   Malignant      1        57
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8859          
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9906          
##             Specificity : 0.8769          
##          Pos Pred Value : 0.9292          
##          Neg Pred Value : 0.9828          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.9337          
##                                           
##        'Positive' Class : Benign          
## 

adaBoost

library(rpart)
library(ada)
# Gentle AdaBoost over deep rpart trees (cp = -1 disables complexity pruning).
control <- rpart.control(cp = -1, maxdepth = 14,maxcompete = 1,xval = 0)
learn_ada <- ada(diagnosis~., data = train, test.x = train[,-1], test.y = train[,1], type = "gentle", control = control, iter = 70)
pre_ada <- predict(learn_ada, test[,-1])
cm_ada <- confusionMatrix(pre_ada, test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         8
##   Malignant      2        57
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8737          
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.9811          
##             Specificity : 0.8769          
##          Pos Pred Value : 0.9286          
##          Neg Pred Value : 0.9661          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6550          
##       Balanced Accuracy : 0.9290          
##                                           
##        'Positive' Class : Benign          
## 

SVM

# RBF-kernel SVM with e1071 defaults; tuned over cost/gamma in the next section.
learn_svm <- svm(diagnosis~., data=train)
pre_svm <- predict(learn_svm, test[,-1])
cm_svm <- confusionMatrix(pre_svm, test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105         9
##   Malignant      1        56
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8729          
##  Mcnemar's Test P-Value : 0.02686         
##                                           
##             Sensitivity : 0.9906          
##             Specificity : 0.8615          
##          Pos Pred Value : 0.9211          
##          Neg Pred Value : 0.9825          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.9261          
##                                           
##        'Positive' Class : Benign          
## 

SVM - Tune

# Tuning grid for the SVM: 21 gamma values x 6 cost values.
gamma <- seq(0,0.1,0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost=cost, gamma=gamma)    ## 126 combinations (6 cost x 21 gamma)

total_accuracy_svm <- function(train, test, grid = parms){
    # Evaluate an RBF SVM for every (cost, gamma) row of 'grid' and return the
    # test-set accuracy for each combination.
    #
    # Args:
    #   train: data frame whose first column is the 'diagnosis' factor.
    #   test:  data frame with the same layout as 'train'.
    #   grid:  data frame with 'cost' and 'gamma' columns; defaults to the
    #          globally defined 'parms' for backward compatibility.
    # Returns:
    #   Numeric vector with one accuracy per grid row.
    accuracy <- numeric(NROW(grid))    # preallocate instead of growing
    for(i in seq_len(NROW(grid))){
        learn_svm <- svm(diagnosis~., data=train, gamma=grid$gamma[i], cost=grid$cost[i])
        pre_svm <- predict(learn_svm, test[,-1])
        cm <- confusionMatrix(pre_svm, test$diagnosis)
        accuracy[i] <- cm$overall[1]    # overall accuracy
    }
    accuracy
}

# Accuracy over the whole (cost, gamma) grid. The result is named 'svm_acc'
# rather than 'c' so it does not shadow base::c().
svm_acc <- total_accuracy_svm(train,test)
opt_parms <- which(svm_acc==max(svm_acc))[1]    # first grid row attaining the max


# Refit with the best (cost, gamma) pair and evaluate on the test set.
learn_imp_svm <- svm(diagnosis~., data=train, cost=parms$cost[opt_parms], gamma=parms$gamma[opt_parms])
pre_imp_svm <- predict(learn_imp_svm, test[,-1])
cm_imp_svm <- confusionMatrix(pre_imp_svm, test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         4
##   Malignant      0        61
##                                           
##                Accuracy : 0.9766          
##                  95% CI : (0.9412, 0.9936)
##     No Information Rate : 0.6199          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9498          
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9385          
##          Pos Pred Value : 0.9636          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6199          
##          Detection Rate : 0.6199          
##    Detection Prevalence : 0.6433          
##       Balanced Accuracy : 0.9692          
##                                           
##        'Positive' Class : Benign          
## 

4-4) Visualize to compare the accuracy of all methods

# 3x5 panel of fourfold (confusion-matrix) plots, one per model, with each
# model's overall test accuracy (%) in the title.
col <- c("#ed3b3b", "#0099ff")
par(mfrow=c(3,5))
fourfoldplot(cm_c50$table, color = col, conf.level = 0, margin = 1, main=paste("C5.0 (",round(cm_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_c50$table, color = col, conf.level = 0, margin = 1, main=paste("Improve C5.0 (",round(cm_imp_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rp$table, color = col, conf.level = 0, margin = 1, main=paste("RPart (",round(cm_rp$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_pru$table, color = col, conf.level = 0, margin = 1, main=paste("Prune (",round(cm_pru$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_1r$table, color = col, conf.level = 0, margin = 1, main=paste("OneR (",round(cm_1r$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_jrip$table, color = col, conf.level = 0, margin = 1, main=paste("JRip (",round(cm_jrip$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ct$table, color = col, conf.level = 0, margin = 1, main=paste("CTree (",round(cm_ct$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_nb$table, color = col, conf.level = 0, margin = 1, main=paste("NaiveBayes (",round(cm_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_nb$table, color = col, conf.level = 0, margin = 1, main=paste("Improve NaiveBayes\n(",round(cm_imp_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_knn$table, color = col, conf.level = 0, margin = 1, main=paste("KNN (",round(cm_knn$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rf$table, color = col, conf.level = 0, margin = 1, main=paste("RandomForest (",round(cm_rf$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_gbm$table, color = col, conf.level = 0, margin = 1, main=paste("GBM (",round(cm_gbm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ada$table, color = col, conf.level = 0, margin = 1, main=paste("AdaBoost (",round(cm_ada$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_svm$table, color = col, conf.level = 0, margin = 1, main=paste("SVM (",round(cm_svm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_svm$table, color = col, conf.level = 0, margin = 1, main=paste("Improve SVM (",round(cm_imp_svm$overall[1]*100),"%)",sep=""))

4-5) Select a best prediction model according to high accuracy

# Collect every model's overall test accuracy and keep the best-scoring model(s).
opt_predict <- c(cm_c50$overall[1], cm_imp_c50$overall[1], cm_rp$overall[1], cm_pru$overall[1], cm_1r$overall[1], cm_jrip$overall[1], cm_ct$overall[1], cm_nb$overall[1], cm_imp_nb$overall[1], cm_knn$overall[1], cm_rf$overall[1], cm_gbm$overall[1], cm_ada$overall[1], cm_svm$overall[1], cm_imp_svm$overall[1])
names(opt_predict) <- c("c50","imp_c50","rpart","prune","1r","jrip","ctree","nb","imp_nb","knn","rf","gbm","ada","svm","imp_svm")
best_predict_model <- subset(opt_predict, opt_predict==max(opt_predict))
best_predict_model
##   imp_svm 
## 0.9766082

4-6) Apply every ML method (that I know) to the data with correlation-based feature selection:

our features are: ‘texture_mean’, ‘area_mean’, ‘smoothness_mean’, ‘concavity_mean’, ‘symmetry_mean’, ‘fractal_dimension_mean’, ‘area_se’, ‘smoothness_se’, ‘fractal_dimension_se’, ‘smoothness_worst’, ‘concavity_worst’, ‘symmetry_worst’

# Keep diagnosis (column 1) plus the correlation-selected feature columns.
wbcd2=wbcd[,c(1,2,3,6,7,10,12,13,16,17,20,22,23,26,27,30)]
nrows <- NROW(wbcd2)
# set.seed(1)               ## fix random value
index <- sample(1:nrows, 0.7 * nrows)   ## shuffle and divide

#train <- wbcd                  ## full 569-row data (100%)
train <- wbcd2[index,]          ## 398 training rows (70%)
test <- wbcd2[-index,]                  ## 171 test rows (30%)
library(caret)

C5.0

library(C50)
# C5.0 decision tree on the reduced feature set (column 1 is the diagnosis factor).
learn_c50 <- C5.0(train[,-1],train$diagnosis)
pre_c50 <- predict(learn_c50, test[,-1])
cm_c50 <- confusionMatrix(pre_c50, test$diagnosis)
cm_c50
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       104         4
##   Malignant      3        60
##                                           
##                Accuracy : 0.9591          
##                  95% CI : (0.9175, 0.9834)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9123          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9720          
##             Specificity : 0.9375          
##          Pos Pred Value : 0.9630          
##          Neg Pred Value : 0.9524          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6082          
##    Detection Prevalence : 0.6316          
##       Balanced Accuracy : 0.9547          
##                                           
##        'Positive' Class : Benign          
## 

C5.0 - Tune

total_accuracy_c50 <- function(train, test){
    # Grid-search the number of C5.0 boosting trials over 1..100.
    #
    # Args:
    #   train: data frame whose first column is the 'diagnosis' factor.
    #   test:  data frame with the same layout as 'train'.
    # Returns:
    #   Numeric vector of length 100; element i is the test-set accuracy
    #   obtained with trials = i.
    accuracy <- numeric(100)    # preallocate instead of growing inside the loop
    for(i in seq_len(100)){
        learn_imp_c50 <- C5.0(train[,-1],train$diagnosis,trials = i)
        p_c50 <- predict(learn_imp_c50, test[,-1])
        cm <- confusionMatrix(p_c50, test$diagnosis)
        accuracy[i] <- cm$overall[1]    # overall accuracy
    }
    accuracy
}

# Accuracy for trials = 1..100; take the first boosting count attaining the max.
a <- total_accuracy_c50(train,test)
opt_trials <- which(a==max(a))[1]

# Refit with the selected number of boosting trials and evaluate on the test set.
learn_imp_c50 <- C5.0(train[,-1],train$diagnosis,trials=opt_trials)
pre_imp_c50 <- predict(learn_imp_c50, test[,-1])
cm_imp_c50 <- confusionMatrix(pre_imp_c50, test$diagnosis)
cm_imp_c50
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         2
##   Malignant      1        62
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9624          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.9688          
##          Pos Pred Value : 0.9815          
##          Neg Pred Value : 0.9841          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6199          
##    Detection Prevalence : 0.6316          
##       Balanced Accuracy : 0.9797          
##                                           
##        'Positive' Class : Benign          
## 

rpart

library(rpart)
# CART tree; minsplit=2 allows splits down to near-pure nodes (overfits; the
# pruning step below trims it back).
learn_rp <- rpart(diagnosis~.,data=train,control=rpart.control(minsplit=2))
pre_rp <- predict(learn_rp, test[,-1], type="class")
cm_rp  <- confusionMatrix(pre_rp, test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102         5
##   Malignant      5        59
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8751          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9533          
##             Specificity : 0.9219          
##          Pos Pred Value : 0.9533          
##          Neg Pred Value : 0.9219          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5965          
##    Detection Prevalence : 0.6257          
##       Balanced Accuracy : 0.9376          
##                                           
##        'Positive' Class : Benign          
## 

Prune

# Prune at the complexity parameter minimising the cross-validated error (xerror).
learn_pru <- prune(learn_rp, cp=learn_rp$cptable[which.min(learn_rp$cptable[,"xerror"]),"CP"])
pre_pru <- predict(learn_pru, test[,-1], type="class")
cm_pru <-confusionMatrix(pre_pru, test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102         5
##   Malignant      5        59
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8751          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9533          
##             Specificity : 0.9219          
##          Pos Pred Value : 0.9533          
##          Neg Pred Value : 0.9219          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5965          
##    Detection Prevalence : 0.6257          
##       Balanced Accuracy : 0.9376          
##                                           
##        'Positive' Class : Benign          
## 

OneR

library("RWeka")
# OneR: single-attribute rule learner, used as a simple baseline.
learn_1r <- OneR(diagnosis~., data=train)
pre_1r <- predict(learn_1r, test[,-1])
cm_1r   <- confusionMatrix(pre_1r, test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       105         8
##   Malignant      2        56
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8727          
##  Mcnemar's Test P-Value : 0.1138          
##                                           
##             Sensitivity : 0.9813          
##             Specificity : 0.8750          
##          Pos Pred Value : 0.9292          
##          Neg Pred Value : 0.9655          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6140          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.9282          
##                                           
##        'Positive' Class : Benign          
## 

JRip

# RIPPER rule induction (RWeka, loaded above for OneR).
learn_jrip <- JRip(diagnosis ~ ., data=train)
pre_jrip <- predict(learn_jrip, test[,-1])
cm_jrip <- confusionMatrix(pre_jrip, test$diagnosis)
cm_jrip
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       102         6
##   Malignant      5        58
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8622          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9533          
##             Specificity : 0.9062          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.9206          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5965          
##    Detection Prevalence : 0.6316          
##       Balanced Accuracy : 0.9298          
##                                           
##        'Positive' Class : Benign          
## 

naiveBayes

library(e1071)
# Naive Bayes on the correlation-selected feature set (column 1 is the diagnosis factor).
learn_nb <- naiveBayes(train[,-1], train$diagnosis)
pre_nb <- predict(learn_nb, test[,-1])
cm_nb     <- confusionMatrix(pre_nb, test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       100         2
##   Malignant      7        62
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8894          
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.9346          
##             Specificity : 0.9688          
##          Pos Pred Value : 0.9804          
##          Neg Pred Value : 0.8986          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5848          
##    Detection Prevalence : 0.5965          
##       Balanced Accuracy : 0.9517          
##                                           
##        'Positive' Class : Benign          
## 

naiveBayes - Tune

total_accuracy_nb <- function(train, test){
    # Grid-search the naive Bayes Laplace smoothing parameter over 1..100.
    #
    # Args:
    #   train: data frame whose first column is the 'diagnosis' factor.
    #   test:  data frame with the same layout as 'train'.
    # Returns:
    #   Numeric vector of length 100; element i is the test-set accuracy
    #   obtained with laplace = i.
    library(e1071)
    library(caret)
    accuracy <- numeric(100)    # preallocate instead of growing inside the loop
    for(i in seq_len(100)){
        learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace=i)
        p_nb <- predict(learn_imp_nb, test[,-1])
        cm <- confusionMatrix(p_nb, test$diagnosis)
        accuracy[i] <- cm$overall[1]    # overall accuracy from caret's summary
    }
    accuracy
}

# Accuracy for each Laplace value 1..100; keep the first value attaining the max.
b <- total_accuracy_nb(train,test)
opt_laplace <- which(b==max(b))[1]

# Refit with the selected smoothing value and evaluate on the test set.
learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace=opt_laplace)
pre_imp_nb <- predict(learn_imp_nb, test[,-1])
cm_imp_nb <- confusionMatrix(pre_imp_nb, test$diagnosis)
cm_imp_nb
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       100         2
##   Malignant      7        62
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.9024, 0.9757)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8894          
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.9346          
##             Specificity : 0.9688          
##          Pos Pred Value : 0.9804          
##          Neg Pred Value : 0.8986          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5848          
##    Detection Prevalence : 0.5965          
##       Balanced Accuracy : 0.9517          
##                                           
##        'Positive' Class : Benign          
## 

randomForest

library(randomForest)
# 100-tree random forest on the reduced feature set; proximity=T also computes
# the case proximity matrix.
learn_rf <- randomForest(diagnosis~., data=train, ntree=100, proximity=T)
pre_rf   <- predict(learn_rf, test[,-1])
cm_rf    <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         2
##   Malignant      1        62
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9496, 0.9964)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9624          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.9688          
##          Pos Pred Value : 0.9815          
##          Neg Pred Value : 0.9841          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6199          
##    Detection Prevalence : 0.6316          
##       Balanced Accuracy : 0.9797          
##                                           
##        'Positive' Class : Benign          
## 

ctree

library(party)
# Conditional inference tree, depth capped at 2 for an interpretable model.
learn_ct <- ctree(diagnosis~., data=train, controls=ctree_control(maxdepth=2))
pre_ct   <- predict(learn_ct, test[,-1])
cm_ct    <- confusionMatrix(pre_ct, test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        99         3
##   Malignant      8        61
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8878, 0.9675)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8648          
##  Mcnemar's Test P-Value : 0.2278          
##                                           
##             Sensitivity : 0.9252          
##             Specificity : 0.9531          
##          Pos Pred Value : 0.9706          
##          Neg Pred Value : 0.8841          
##              Prevalence : 0.6257          
##          Detection Rate : 0.5789          
##    Detection Prevalence : 0.5965          
##       Balanced Accuracy : 0.9392          
##                                           
##        'Positive' Class : Benign          
## 

KNN

library(class)
# k-nearest neighbours (k=25); prob=T attaches the winning-vote proportion.
# NOTE(review): features are still unscaled here — confirm this is intentional.
pre_knn <- knn(train = train[,-1], test = test[,-1], cl = train[,1], k=25, prob=T)
cm_knn  <- confusionMatrix(pre_knn, test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         5
##   Malignant      1        59
##                                          
##                Accuracy : 0.9649         
##                  95% CI : (0.9252, 0.987)
##     No Information Rate : 0.6257         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9241         
##  Mcnemar's Test P-Value : 0.2207         
##                                          
##             Sensitivity : 0.9907         
##             Specificity : 0.9219         
##          Pos Pred Value : 0.9550         
##          Neg Pred Value : 0.9833         
##              Prevalence : 0.6257         
##          Detection Rate : 0.6199         
##    Detection Prevalence : 0.6491         
##       Balanced Accuracy : 0.9563         
##                                          
##        'Positive' Class : Benign         
## 

GBM

library(gbm)
# First gbm fit is used only to choose the number of boosting iterations via
# 3-fold CV (gbm.perf below).
# NOTE(review): distribution="gaussian" with a factor response is unusual —
# presumably only best.iter is consumed from this fit; verify.
test_gbm <- gbm(diagnosis~., data=train, distribution="gaussian",n.trees = 10000,
                shrinkage = 0.01, interaction.depth = 4, bag.fraction=0.5, train.fraction=0.5,n.minobsinnode=10,cv.folds=3,keep.data=TRUE,verbose=FALSE,n.cores=1)
best.iter <- gbm.perf(test_gbm, method="cv",plot.it=FALSE)
# Refit through caret with the CV-chosen iteration count fixed in the tuning grid.
fitControl = trainControl(method="cv", number=5, returnResamp="all")
learn_gbm = train(diagnosis~., data=train, method="gbm", distribution="bernoulli", trControl=fitControl, verbose=F, tuneGrid=data.frame(.n.trees=best.iter, .shrinkage=0.01, .interaction.depth=1, .n.minobsinnode=1))
pre_gbm <- predict(learn_gbm, test[,-1])
cm_gbm <- confusionMatrix(pre_gbm, test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       107         7
##   Malignant      0        57
##                                           
##                Accuracy : 0.9591          
##                  95% CI : (0.9175, 0.9834)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9106          
##  Mcnemar's Test P-Value : 0.02334         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8906          
##          Pos Pred Value : 0.9386          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6257          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.9453          
##                                           
##        'Positive' Class : Benign          
## 

adaBoost

library(rpart)
library(ada)
# Gentle AdaBoost over deep rpart trees (cp = -1 disables complexity pruning).
control <- rpart.control(cp = -1, maxdepth = 14,maxcompete = 1,xval = 0)
learn_ada <- ada(diagnosis~., data = train, test.x = train[,-1], test.y = train[,1], type = "gentle", control = control, iter = 70)
pre_ada <- predict(learn_ada, test[,-1])
cm_ada <- confusionMatrix(pre_ada, test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         4
##   Malignant      1        60
##                                           
##                Accuracy : 0.9708          
##                  95% CI : (0.9331, 0.9904)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.937           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.9375          
##          Pos Pred Value : 0.9636          
##          Neg Pred Value : 0.9836          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6199          
##    Detection Prevalence : 0.6433          
##       Balanced Accuracy : 0.9641          
##                                           
##        'Positive' Class : Benign          
## 

SVM

# Fit a support-vector machine (e1071::svm) with its defaults
# (C-classification, radial kernel, gamma = 1/ncol, cost = 1).
learn_svm <- svm(diagnosis~., data=train)
# Predict on the test predictors (column 1 is the diagnosis label)
pre_svm <- predict(learn_svm, test[,-1])
cm_svm <- confusionMatrix(pre_svm, test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         1
##   Malignant      1        63
##                                           
##                Accuracy : 0.9883          
##                  95% CI : (0.9584, 0.9986)
##     No Information Rate : 0.6257          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.975           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.9844          
##          Pos Pred Value : 0.9907          
##          Neg Pred Value : 0.9844          
##              Prevalence : 0.6257          
##          Detection Rate : 0.6199          
##    Detection Prevalence : 0.6257          
##       Balanced Accuracy : 0.9875          
##                                           
##        'Positive' Class : Benign          
## 

SVM - Tune

# Hyper-parameter grid for SVM tuning: 21 gamma values x 6 cost values.
gamma <- seq(0,0.1,0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost=cost, gamma=gamma)    ## 126 combinations

# Grid-search helper: fits one SVM per row of the global 'parms' grid and
# returns the test-set accuracy of each fit.
#
# Args:
#   train: data.frame with 'diagnosis' outcome plus predictors.
#   test:  data.frame with the same layout; column 1 must be 'diagnosis'.
# Returns: numeric vector, one accuracy per row of 'parms'.
# Depends on globals: 'parms' (cost/gamma grid), e1071::svm,
# caret::confusionMatrix.
total_accuracy_svm <- function(train, test){
    n_parms <- NROW(parms)
    accuracy <- numeric(n_parms)        # preallocate instead of growing in the loop
    for(i in seq_len(n_parms)){
        learn_svm <- svm(diagnosis~., data=train, gamma=parms$gamma[i], cost=parms$cost[i])
        pre_svm <- predict(learn_svm, test[,-1])
        cm <- confusionMatrix(pre_svm, test$diagnosis)
        accuracy[i] <- cm$overall[1]    # overall[1] is "Accuracy"
    }
    accuracy
}

# Evaluate every (cost, gamma) pair and keep the index of the first maximum.
# Renamed the accumulator from 'c' (which shadowed base::c) and replaced
# which(x == max(x))[1] with the equivalent which.max().
svm_accuracy <- total_accuracy_svm(train,test)
opt_parms <- which.max(svm_accuracy)


# Refit the SVM with the best cost/gamma found by the grid search
# (opt_parms indexes the winning row of 'parms').
learn_imp_svm <- svm(diagnosis~., data=train, cost=parms$cost[opt_parms], gamma=parms$gamma[opt_parms])
pre_imp_svm <- predict(learn_imp_svm, test[,-1])
cm_imp_svm <- confusionMatrix(pre_imp_svm, test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       107         0
##   Malignant      0        64
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9787, 1)
##     No Information Rate : 0.6257     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6257     
##          Detection Rate : 0.6257     
##    Detection Prevalence : 0.6257     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Benign     
## 

4-7) Visualize to compare the accuracy of all methods

col <- c("#ed3b3b", "#0099ff")
# One fourfold plot per model in a 3x5 grid. The list names carry each
# panel's title prefix (up to the opening parenthesis) so the repeated
# fourfoldplot(...) call can be written once; the "\n" in the NaiveBayes
# label wraps its long title onto two lines, as in the original.
cm_panels <- list(
    "C5.0 ("                = cm_c50,
    "Improve C5.0 ("        = cm_imp_c50,
    "RPart ("               = cm_rp,
    "Prune ("               = cm_pru,
    "OneR ("                = cm_1r,
    "JRip ("                = cm_jrip,
    "CTree ("               = cm_ct,
    "NaiveBayes ("          = cm_nb,
    "Improve NaiveBayes\n(" = cm_imp_nb,
    "KNN ("                 = cm_knn,
    "RandomForest ("        = cm_rf,
    "GBM ("                 = cm_gbm,
    "AdaBoost ("            = cm_ada,
    "SVM ("                 = cm_svm,
    "Improve SVM ("         = cm_imp_svm
)
old_par <- par(mfrow=c(3,5))    # save so the layout can be restored afterwards
for (label in names(cm_panels)) {
    cm <- cm_panels[[label]]
    # Title e.g. "C5.0 (96%)": label prefix + rounded accuracy percentage.
    fourfoldplot(cm$table, color = col, conf.level = 0, margin = 1,
                 main = paste0(label, round(cm$overall[1]*100), "%)"))
}
par(old_par)                    # undo the mfrow side effect

### 4-8) Select a best prediction model according to high accuracy

# Collect the test-set accuracy of every fitted model under a short label,
# then keep whichever model(s) reached the maximum accuracy.
opt_predict <- setNames(
    c(cm_c50$overall[1], cm_imp_c50$overall[1], cm_rp$overall[1],
      cm_pru$overall[1], cm_1r$overall[1], cm_jrip$overall[1],
      cm_ct$overall[1], cm_nb$overall[1], cm_imp_nb$overall[1],
      cm_knn$overall[1], cm_rf$overall[1], cm_gbm$overall[1],
      cm_ada$overall[1], cm_svm$overall[1], cm_imp_svm$overall[1]),
    c("c50","imp_c50","rpart","prune","1r","jrip","ctree","nb",
      "imp_nb","knn","rf","gbm","ada","svm","imp_svm"))
best_predict_model <- opt_predict[opt_predict == max(opt_predict)]
best_predict_model
## imp_svm 
##       1

5. Prepare Patient data for testing function

5-1) Import patient data

# Re-import the raw CSV to simulate incoming patient records (unlike 'wbcd',
# the id column is kept); drop the empty trailing "X" column.
patient <- read.csv("breast_cancer.csv", header=T, stringsAsFactors=F)
patient$X <- NULL

Malignant patient

# Pick row 19 as a known-malignant example; show its id and diagnosis.
M <- patient[19,]               ## 19th patient
M[,c(1,2)]          ## Malignant
##        id diagnosis
## 19 849014         M

Benign patient

# Pick row 20 as a known-benign example; show its id and diagnosis.
B <- patient[20,]               ## 20th patient          
B[,c(1,2)]          ## Benign
##         id diagnosis
## 20 8510426         B

5-2) Delete diagnosis column for testing

# Remove the known labels so both records behave like unseen test inputs
M$diagnosis <- NULL
B$diagnosis <- NULL

6. Patient Cancer Diagnosis Prediction Function (use only 1 test data)

6-1) Patient Diagnosis Function

Use the ‘Improved SVM’ algorithm as the default, since it was rated as the best prediction model.

# Predict one patient's diagnosis with a fitted model.
#
# new:    one-row data.frame whose first column is the patient id and whose
#         remaining columns are the predictors.
# method: any fitted model accepted by predict(); defaults to the tuned SVM
#         ('learn_imp_svm', defined earlier in this document).
# Returns a formatted string "Patient ID: <id>  =>  Result: <class>".
cancer_diagnosis_predict <- function(new, method=learn_imp_svm) {
    diagnosis <- as.character(predict(method, new[,-1]))
    paste0("Patient ID: ", new[,1], "  =>  Result: ", diagnosis)
}

6-2) Testing Function

Benign test data

default = improve svm

cancer_diagnosis_predict(B)         
## [1] "Patient ID: 8510426  =>  Result: Benign"

Use other ML methods

cancer_diagnosis_predict(B,learn_imp_c50)
## [1] "Patient ID: 8510426  =>  Result: Benign"

Malignant test data

default = improve svm

cancer_diagnosis_predict(M)
## [1] "Patient ID: 849014  =>  Result: Malignant"

Use other ML methods

cancer_diagnosis_predict(M,learn_imp_c50)   
## [1] "Patient ID: 849014  =>  Result: Malignant"

7. Visualize (Probability Density Function Graph)

7-1) Create Visualize Function

# Plot the patient's feature values ('new', one row with id in column 1)
# against per-feature density histograms of the training data ('data',
# the wbcd data.frame with 'diagnosis' in column 1). Vertical lines mark
# the patient's value in each facet; red lines flag noteworthy features.
cancer_summary <- function(new,data) {

## [a] Reshape the new dataset for ggplot
## m_train: long form (diagnosis, variable, value) of all training rows;
## m_new:   long form of the patient's predictors (id column dropped).
library(reshape2)
m_train <- melt(data, id="diagnosis")
m_new <- melt(new[,-1])


## [b] Variable To Highlight the key factors (geom_vline-RED)
## NOTE(review): "dimension_worst" does not match any column shown in the
## dataset structure (the column is "fractal_dimension_worst"), so that
## entry never matches — confirm the intended name.
key_factors <- c("radius_mean","perimeter_mean","area_mean","perimeter_worst",
                 "texture_worst","radius_worst","symmetry_se","compactness_worst",
                 "concavity_worst","dimension_worst")

key_col <- ifelse(m_new$variable %in% key_factors,"red","black")


## [c] Save mean of Malignant value & colors
## Column means over the malignant training rows only (diagnosis dropped).
library(dplyr)
mal_mean <- subset(data, diagnosis=="Malignant", select=-1)
mal_mean <- apply(mal_mean,2,mean)

library(stringr)
## Red when the patient's value exceeds the malignant mean, excluding
## "*_worst" features.
## NOTE(review): this relies on m_new's rows (melt order) lining up
## one-to-one with the columns of mal_mean — verify the alignment holds.
mal_col <- ifelse((round(m_new$value,3) > mal_mean) & (str_count(m_new$variable, 'worst') < 1), "red", "black")



## [d] Save titles : Main title, Patient Diagnosis

title <- "Breast Cancer Diagnosis Plot"
## Subtitle reuses the prediction string, e.g. "Patient ID: ... => Result: ..."
subtitle <- cancer_diagnosis_predict(new)



## ★[e] View plot highlighting your manual key factor
## One facet per feature: density histogram per diagnosis class, the
## patient's value as a vertical line (red if in key_factors) plus a label.
library(ggplot2)

res_key <- ggplot(m_train, aes(x=value,color=diagnosis, fill=diagnosis))+
    geom_histogram(aes(y=..density..), alpha=0.5, position="identity", bins=50)+
    geom_density(alpha=.2)+
    scale_color_manual(values=c("#15c3c9","#f87b72"))+
    scale_fill_manual(values=c("#61d4d6","#f5a7a1"))+
    geom_vline(data=m_new, aes(xintercept=value), 
               color=key_col, size=1.5)+
    geom_label(data=m_new, aes(x=Inf, y=Inf, label=round(value,3)), nudge_y=2,  
               vjust = "top", hjust = "right", fill="white", color="black")+
    labs(title=paste(title,"(highlight Key Factors)"), subtitle=subtitle)+
    theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
    theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5))+
    labs(caption="[Training 569 wisc cancer diagnostic patient data]")+
    facet_wrap(~variable, scales="free", ncol=5)



## ★[f] View plots highlighting values above average of malignant patient
## Same layout as [e] but vertical lines are colored by mal_col instead.
res_mean <- ggplot(m_train, aes(x=value,color=diagnosis, fill=diagnosis))+
    geom_histogram(aes(y=..density..), alpha=0.5, position="identity", bins=50)+
    geom_density(alpha=.2)+
    scale_color_manual(values=c("#15c3c9","#f87b72"))+
    scale_fill_manual(values=c("#61d4d6","#f5a7a1"))+
    geom_vline(data=m_new, aes(xintercept=value), 
               color=mal_col, size=1.5)+
    geom_label(data=m_new, aes(x=Inf, y=Inf, label=round(value,3)), nudge_y=2,  
               vjust = "top", hjust = "right", fill="white", color="black")+
    labs(title=paste(title,"(highlight Above malignant average)"), subtitle=subtitle)+
    theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
    theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5, size=12))+
    labs(caption="[Training 569 wisc cancer diagnostic patient data]")+
    facet_wrap(~variable, scales="free", ncol=5)



## [g] output graph
## Only the malignant-average plot is returned; switch to res_key to show
## the key-factor variant instead.
res_mean
#res_key

}

7-2) Testing Function

Benign

cancer_summary(B, wbcd)

Malignant

cancer_summary(M, wbcd)

8. Visualize (Radar)

8-1) Create Visualize Function

# Radar-chart comparison of one patient ('new', one row with id in column 1)
# against the average benign profile from the training data ('data', wbcd).
# All features are min-max normalized on the benign rows so both series
# share one scale; facets split features by their suffix (mean/se/worst).
cancer_radar <- function(new,data) {

## [a] Radar Function
## Polar coordinates with linear (straight-edge) interpolation between
## points — the standard ggproto hack for radar plots on top of CoordPolar.
coord_radar <- function (theta = "x", start = 0, direction = 1) 
{
        theta <- match.arg(theta, c("x", "y"))
        r <- ifelse(theta == "x", "y", "x")
        ggproto("CoordRadar", CoordPolar, theta = theta, r = r, start = start, 
                direction = sign(direction),
                is_linear = function(coord) TRUE)
}


## [b] Normalize Function -> you can use rescale instead.
## Min-max scaling to [0, 1].
normalize <- function(x) {
    return((x-min(x))/(max(x)-min(x)))
}


## [c] Get average from Normal(Benign) Data to set standards (Grey area)
## be: per-feature mean of the normalized benign rows (the grey baseline).
b1 <- subset(data, diagnosis=="Benign", select=-1)
b2 <- as.data.frame(lapply(b1,normalize))           
be <- colMeans(b2)


## [d] Normalize Patient Data to compare with normal dataset
## Scale the patient with the BENIGN min/max, so values can exceed 1;
## max_value sets the radial axis limit accordingly.
p_new <- (new[,-1]-apply(b1,2,min))/(apply(b1,2,max)-apply(b1,2,min))
max_value <- max(p_new)


## [e] Combine Two data (Normal, Patient)
cc_radar <- rbind(be,p_new)
cc_radar <- cbind(group=c("Normal","Patient"),cc_radar)

coc <- melt(cc_radar, id="group")
library(stringr)
coc$variable <- as.character(coc$variable)
## Names with two underscores (e.g. fractal_dimension_mean) would split into
## three parts below; replace their FIRST "_" with "." so every name splits
## into exactly (feature, type).
coc$variable[str_count(coc$variable,'\\_')>1] <- sub('_', '.', coc$variable[str_count(coc$variable,'\\_')>1])
name <- unlist(strsplit(as.character(coc$variable),"_"))

## Interleaved (feature, type) pairs: odd entries are the feature name,
## even entries the suffix (mean/se/worst).
coc$feature <- name[c(seq(1,length(name),2))]
coc$type <- name[c(seq(2,length(name),2))]  
coc$variable <- NULL

df <- coc[order(coc$feature),]


## [f] Save titles : Main title, Patient Diagnosis
title <- "Breast Cancer Diagnosis Radar"
## Subtitle reuses the prediction string from cancer_diagnosis_predict.
subtitle <- cancer_diagnosis_predict(new)



## ★[g] Radar plot
## Grey filled polygon = benign baseline; red outline = patient; one facet
## per feature suffix (mean/se/worst). NA colors/fills suppress the unused
## outline/fill of each group.
res <- ggplot(df, aes(x=feature,y=value,group=group,fill=group,color=group))+
    geom_point()+geom_polygon(alpha=0.3)+coord_radar()+ylim(0,max_value)+
    scale_color_manual(values=c(NA,"#b10000"))+
    scale_fill_manual(values=c("#8e8e8e",NA))+
    facet_wrap(~type)+
    theme(panel.background=element_rect(fill = "white", colour= NA),
          panel.border=element_rect(fill = NA, colour="grey50"), 
          panel.grid.major=element_line(colour = "grey90", size = 0.2),
          panel.grid.minor=element_line(colour = "grey98", size = 0.5),
          legend.position="bottom",
          strip.background =  element_rect(fill = "grey80", colour = "grey50"),
          axis.text.y=element_text(colour=NA),
          axis.title.y=element_text(colour=NA),
          axis.ticks=element_line(colour = NA))+
          xlab("")+ylab("")+
    labs(title=title, subtitle=subtitle)+
    theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
    theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5, size=12))+
    labs(caption="[Training 569 wisc cancer diagnostic patient data]")



## [h] output graph
res

}

8-2) Testing Function

Benign

cancer_radar(B,wbcd)

Malignant

cancer_radar(M,wbcd)